# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# load the February 2019 Ford GoBike trip data into a DataFrame
ford = pd.read_csv('201902-fordgobike-tripdata.csv')
# overview of data shape and composition: (rows, columns)
print(ford.shape)
(183412, 16)
# column datatypes before any cleaning
ford.dtypes
duration_sec int64 start_time object end_time object start_station_id float64 start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id float64 end_station_name object end_station_latitude float64 end_station_longitude float64 bike_id int64 user_type object member_birth_year float64 member_gender object bike_share_for_all_trip object dtype: object
# list the 16 original feature names
ford.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'start_station_latitude',
'start_station_longitude', 'end_station_id', 'end_station_name',
'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
'member_birth_year', 'member_gender', 'bike_share_for_all_trip'],
dtype='object')
# preview the first 10 rides
ford.head(10)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 2019-02-28 18:53:21.7890 | 2019-03-01 06:42:03.0560 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
| 5 | 1793 | 2019-02-28 23:49:58.6320 | 2019-03-01 00:19:51.7600 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | -122.405950 | 5200 | Subscriber | 1959.0 | Male | No |
| 6 | 1147 | 2019-02-28 23:55:35.1040 | 2019-03-01 00:14:42.5880 | 300.0 | Palm St at Willow St | 37.317298 | -121.884995 | 312.0 | San Jose Diridon Station | 37.329732 | -121.901782 | 3803 | Subscriber | 1983.0 | Female | No |
| 7 | 1615 | 2019-02-28 23:41:06.7660 | 2019-03-01 00:08:02.7560 | 10.0 | Washington St at Kearny St | 37.795393 | -122.404770 | 127.0 | Valencia St at 21st St | 37.756708 | -122.421025 | 6329 | Subscriber | 1989.0 | Male | No |
| 8 | 1570 | 2019-02-28 23:41:48.7900 | 2019-03-01 00:07:59.7150 | 10.0 | Washington St at Kearny St | 37.795393 | -122.404770 | 127.0 | Valencia St at 21st St | 37.756708 | -122.421025 | 6548 | Subscriber | 1988.0 | Other | No |
| 9 | 1049 | 2019-02-28 23:49:47.6990 | 2019-03-01 00:07:17.0250 | 19.0 | Post St at Kearny St | 37.788975 | -122.403452 | 121.0 | Mission Playground | 37.759210 | -122.421339 | 6488 | Subscriber | 1992.0 | Male | No |
# descriptive statistics for numeric variables
# note: member_birth_year min of 1878 already hints at outliers handled later
ford.describe()
| duration_sec | start_station_id | start_station_latitude | start_station_longitude | end_station_id | end_station_latitude | end_station_longitude | bike_id | member_birth_year | |
|---|---|---|---|---|---|---|---|---|---|
| count | 183412.000000 | 183215.000000 | 183412.000000 | 183412.000000 | 183215.000000 | 183412.000000 | 183412.000000 | 183412.000000 | 175147.000000 |
| mean | 726.078435 | 138.590427 | 37.771223 | -122.352664 | 136.249123 | 37.771427 | -122.352250 | 4472.906375 | 1984.806437 |
| std | 1794.389780 | 111.778864 | 0.099581 | 0.117097 | 111.515131 | 0.099490 | 0.116673 | 1664.383394 | 10.116689 |
| min | 61.000000 | 3.000000 | 37.317298 | -122.453704 | 3.000000 | 37.317298 | -122.453704 | 11.000000 | 1878.000000 |
| 25% | 325.000000 | 47.000000 | 37.770083 | -122.412408 | 44.000000 | 37.770407 | -122.411726 | 3777.000000 | 1980.000000 |
| 50% | 514.000000 | 104.000000 | 37.780760 | -122.398285 | 100.000000 | 37.781010 | -122.398279 | 4958.000000 | 1987.000000 |
| 75% | 796.000000 | 239.000000 | 37.797280 | -122.286533 | 235.000000 | 37.797320 | -122.288045 | 5502.000000 | 1992.000000 |
| max | 85444.000000 | 398.000000 | 37.880222 | -121.874119 | 398.000000 | 37.880222 | -121.874119 | 6645.000000 | 2001.000000 |
# count missing values per column
ford.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_id 197 start_station_name 197 start_station_latitude 0 start_station_longitude 0 end_station_id 197 end_station_name 197 end_station_latitude 0 end_station_longitude 0 bike_id 0 user_type 0 member_birth_year 8265 member_gender 8265 bike_share_for_all_trip 0 dtype: int64
There are null values present in the data; the affected rows will be dropped.
# remove every row containing missing values, verify nothing remains,
# then rebuild a clean zero-based index
ford = ford.dropna()
ford.isnull().sum()
ford = ford.reset_index(drop=True)
start_time and end_time are stored with the wrong datatype (object) and will be converted to datetime format.
#check for duplicates (count of fully duplicated rows)
ford.duplicated().sum()
0
There are no duplicates in the data
# cast the start/end timestamp columns from object to datetime64
for time_col in ("start_time", "end_time"):
    ford[time_col] = pd.to_datetime(ford[time_col])
# confirm start_time / end_time are now datetime64[ns]
ford.dtypes
duration_sec int64 start_time datetime64[ns] end_time datetime64[ns] start_station_id float64 start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id float64 end_station_name object end_station_latitude float64 end_station_longitude float64 bike_id int64 user_type object member_birth_year float64 member_gender object bike_share_for_all_trip object dtype: object
# birth years are whole numbers, so store them as 64-bit integers
# (safe here: rows with missing member_birth_year were dropped earlier)
ford['member_birth_year'] = ford['member_birth_year'].astype(np.int64)
#check the description of the datetime columns (count, unique, first/last ride)
ford[["start_time", "end_time"]].describe()
| start_time | end_time | |
|---|---|---|
| count | 174952 | 174952 |
| unique | 174941 | 174939 |
| top | 2019-02-11 17:05:07.840000 | 2019-02-28 17:40:37.328000 |
| freq | 2 | 2 |
| first | 2019-02-01 00:00:20.636000 | 2019-02-01 00:04:52.058000 |
| last | 2019-02-28 23:59:18.548000 | 2019-03-01 08:01:55.975000 |
# engineer ride-start features: wall-clock time, weekday name, and hour
start_dt = ford['start_time'].dt
ford['time_of_day'] = start_dt.strftime('%H:%M:%S')
ford['day_of_week'] = start_dt.strftime('%A')
ford['hour_of_day'] = start_dt.hour
# frequency of each exact start time of day
ford['time_of_day'].value_counts()
17:13:06 17
08:48:09 16
17:36:09 16
08:52:43 16
08:28:59 16
..
20:56:27 1
21:01:53 1
21:01:08 1
06:59:05 1
00:00:20 1
Name: time_of_day, Length: 56941, dtype: int64
# calculate trip duration in minutes from duration_sec.
# reference the column by name rather than by position: the original
# ford.iloc[:, 0] silently breaks if the column order ever changes
ford['duration_mins'] = ford['duration_sec'] / 60
# verify the new engineered columns and their dtypes
ford.dtypes
duration_sec int64 start_time datetime64[ns] end_time datetime64[ns] start_station_id float64 start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id float64 end_station_name object end_station_latitude float64 end_station_longitude float64 bike_id int64 user_type object member_birth_year int64 member_gender object bike_share_for_all_trip object time_of_day object day_of_week object hour_of_day int64 duration_mins float64 dtype: object
# extract member age at ride time: ride year minus birth year.
# start_time is already datetime64 (converted above), so re-parsing it with
# pd.to_datetime was redundant; the unused `import datetime as dt` is removed
ride_year = ford['start_time'].dt.year
ford['age'] = ride_year - ford['member_birth_year']
# bucket member ages into labelled ranges with pd.cut;
# ages outside (0, 100] fall outside the bins and become NaN
age_bins = [0, 20, 30, 40, 50, 60, 100]
age_labels = ['<20', '20-29', '30-39', '40-49', '50-59', '60+']
ford['age_group'] = pd.cut(ford['age'], bins=age_bins, labels=age_labels)
# preview rows including the new engineered columns
ford.head()
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | user_type | member_birth_year | member_gender | bike_share_for_all_trip | time_of_day | day_of_week | hour_of_day | duration_mins | age | age_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | Customer | 1984 | Male | No | 17:32:10 | Thursday | 17 | 869.750000 | 35 | 30-39 |
| 1 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | ... | Customer | 1972 | Male | No | 12:13:13 | Thursday | 12 | 1030.900000 | 47 | 40-49 |
| 2 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | ... | Subscriber | 1989 | Other | No | 17:54:26 | Thursday | 17 | 608.166667 | 30 | 20-29 |
| 3 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | ... | Subscriber | 1974 | Male | Yes | 23:54:18 | Thursday | 23 | 26.416667 | 45 | 40-49 |
| 4 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | ... | Subscriber | 1959 | Male | No | 23:49:58 | Thursday | 23 | 29.883333 | 60 | 50-59 |
5 rows × 22 columns
# age is int64 and age_group is a category dtype
ford.dtypes
duration_sec int64 start_time datetime64[ns] end_time datetime64[ns] start_station_id float64 start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id float64 end_station_name object end_station_latitude float64 end_station_longitude float64 bike_id int64 user_type object member_birth_year int64 member_gender object bike_share_for_all_trip object time_of_day object day_of_week object hour_of_day int64 duration_mins float64 age int64 age_group category dtype: object
# ride counts per weekday; single-bracket selection returns a Series,
# consistent with the other value_counts() calls in this notebook
# (double brackets produced a DataFrame value_counts with a MultiIndex)
ford['day_of_week'].value_counts()
day_of_week Thursday 33712 Tuesday 30584 Wednesday 28426 Friday 27663 Monday 25641 Sunday 14512 Saturday 14414 dtype: int64
# ride counts per age group
ford['age_group'].value_counts()
20-29 70054 30-39 63808 40-49 22024 50-59 11323 <20 4213 60+ 3458 Name: age_group, dtype: int64
# frequency tables for the three categorical ride attributes
print('user_type value counts:\n',ford['user_type'].value_counts())
print('\n')
print('member_gender value counts:\n',ford['member_gender'].value_counts())
print('\n')
print('bike_share_for_all_trip value counts:\n',ford['bike_share_for_all_trip'].value_counts())
user_type value counts: Subscriber 158386 Customer 16566 Name: user_type, dtype: int64 member_gender value counts: Male 130500 Female 40805 Other 3647 Name: member_gender, dtype: int64 bike_share_for_all_trip value counts: No 157606 Yes 17346 Name: bike_share_for_all_trip, dtype: int64
# cast user_type, member_gender, bike_share_for_all_trip and day_of_week
# to ordered categorical dtypes with explicit level ordering
ordinal_var_dict = {'user_type': ['Subscriber','Customer'],
                    'member_gender': ['Male', 'Female', 'Other'],
                    'bike_share_for_all_trip': ['Yes', 'No'],
                    'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']}
for var, levels in ordinal_var_dict.items():
    ordered_dtype = pd.api.types.CategoricalDtype(categories=levels, ordered=True)
    ford[var] = ford[var].astype(ordered_dtype)
# most common start/end station pairs
ford[["start_station_name", "end_station_name"]].value_counts()
start_station_name end_station_name
Berry St at 4th St San Francisco Ferry Building (Harry Bridges Plaza) 327
Grand Ave at Perkins St 19th Street BART Station 308
San Francisco Ferry Building (Harry Bridges Plaza) The Embarcadero at Sansome St 286
19th Street BART Station Grand Ave at Perkins St 283
The Embarcadero at Sansome St Steuart St at Market St 282
...
S Van Ness Ave at Market St The Embarcadero at Bryant St 1
Ellsworth St at Russell St Vine St at Shattuck Ave 1
Woolsey St at Sacramento St 1
Embarcadero BART Station (Beale St at Market St) 16th St Mission BART Station 2 1
Cesar Chavez St at Dolores St 14th St at Mission St 1
Length: 23303, dtype: int64
# inspect the coordinate columns that feed the distance calculation
ford[["start_station_latitude", "start_station_longitude", "end_station_latitude", "end_station_longitude"]]
| start_station_latitude | start_station_longitude | end_station_latitude | end_station_longitude | |
|---|---|---|---|---|
| 0 | 37.789625 | -122.400811 | 37.794231 | -122.402923 |
| 1 | 37.769305 | -122.426826 | 37.786375 | -122.404904 |
| 2 | 37.774836 | -122.446546 | 37.773311 | -122.444293 |
| 3 | 37.804562 | -122.271738 | 37.792714 | -122.248780 |
| 4 | 37.770407 | -122.391198 | 37.798014 | -122.405950 |
| ... | ... | ... | ... | ... |
| 174947 | 37.788059 | -122.391865 | 37.788300 | -122.408531 |
| 174948 | 37.789625 | -122.400811 | 37.778742 | -122.392741 |
| 174949 | 37.331932 | -121.904888 | 37.333658 | -121.908586 |
| 174950 | 37.811351 | -122.273422 | 37.817827 | -122.275698 |
| 174951 | 37.789677 | -122.390428 | 37.785000 | -122.395936 |
174952 rows × 4 columns
from math import radians, sin, cos, sqrt, atan2
# define a function to calculate the Haversine distance between two points and create a distance column
def haversine(start_lat, start_lon, end_lat, end_lon):
    """Return the great-circle distance in km between two (lat, lon) points.

    Uses the Haversine formula with a mean Earth radius of 6371 km.
    Inputs are in decimal degrees.
    """
    lat1, lon1, lat2, lon2 = (radians(v) for v in (start_lat, start_lon, end_lat, end_lon))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    central_angle = 2 * atan2(sqrt(a), sqrt(1 - a))
    return 6371 * central_angle
# add a distance_km column: haversine distance between start and end stations
ford['distance_km'] = ford.apply(
    lambda row: haversine(row['start_station_latitude'], row['start_station_longitude'],
                          row['end_station_latitude'], row['end_station_longitude']),
    axis=1)
# frequency of each computed distance; the 0.0 km entries flag same-station trips
ford['distance_km'].value_counts()
0.000000 3458
1.035589 591
2.171534 493
1.349212 491
1.413548 474
...
1.355289 1
5.432732 1
3.883778 1
2.968655 1
2.890919 1
Name: distance_km, Length: 13760, dtype: int64
There are distances of 0 km because those rides started and ended at the same station (round trips).
# inspect the zero-distance rides: start and end station are identical
ford[ford['distance_km']==0]
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | member_birth_year | member_gender | bike_share_for_all_trip | time_of_day | day_of_week | hour_of_day | duration_mins | age | age_group | distance_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 17 | 874 | 2019-02-28 23:43:05.183 | 2019-02-28 23:57:39.796 | 180.0 | Telegraph Ave at 23rd St | 37.812678 | -122.268773 | 180.0 | Telegraph Ave at 23rd St | 37.812678 | ... | 1978 | Male | No | 23:43:05 | Thursday | 23 | 14.566667 | 41 | 40-49 | 0.0 |
| 25 | 408 | 2019-02-28 23:48:08.282 | 2019-02-28 23:54:56.930 | 78.0 | Folsom St at 9th St | 37.773717 | -122.411647 | 78.0 | Folsom St at 9th St | 37.773717 | ... | 1982 | Male | No | 23:48:08 | Thursday | 23 | 6.800000 | 37 | 30-39 | 0.0 |
| 31 | 471 | 2019-02-28 23:42:43.361 | 2019-02-28 23:50:34.446 | 133.0 | Valencia St at 22nd St | 37.755213 | -122.420975 | 133.0 | Valencia St at 22nd St | 37.755213 | ... | 1992 | Male | No | 23:42:43 | Thursday | 23 | 7.850000 | 27 | 20-29 | 0.0 |
| 51 | 3478 | 2019-02-28 22:39:35.020 | 2019-02-28 23:37:33.342 | 11.0 | Davis St at Jackson St | 37.797280 | -122.398436 | 11.0 | Davis St at Jackson St | 37.797280 | ... | 1995 | Male | No | 22:39:35 | Thursday | 22 | 57.966667 | 24 | 20-29 | 0.0 |
| 52 | 3140 | 2019-02-28 22:44:53.503 | 2019-02-28 23:37:14.090 | 11.0 | Davis St at Jackson St | 37.797280 | -122.398436 | 11.0 | Davis St at Jackson St | 37.797280 | ... | 1983 | Female | No | 22:44:53 | Thursday | 22 | 52.333333 | 36 | 30-39 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 174863 | 1476 | 2019-02-01 02:45:04.744 | 2019-02-01 03:09:41.184 | 345.0 | Hubbell St at 16th St | 37.766483 | -122.398279 | 345.0 | Hubbell St at 16th St | 37.766483 | ... | 1967 | Male | No | 02:45:04 | Friday | 2 | 24.600000 | 52 | 50-59 | 0.0 |
| 174864 | 877 | 2019-02-01 02:53:15.995 | 2019-02-01 03:07:53.058 | 385.0 | Woolsey St at Sacramento St | 37.850578 | -122.278175 | 385.0 | Woolsey St at Sacramento St | 37.850578 | ... | 1987 | Male | No | 02:53:15 | Friday | 2 | 14.616667 | 32 | 30-39 | 0.0 |
| 174872 | 5713 | 2019-02-01 01:02:55.168 | 2019-02-01 02:38:09.002 | 31.0 | Raymond Kimbell Playground | 37.783813 | -122.434559 | 31.0 | Raymond Kimbell Playground | 37.783813 | ... | 1972 | Male | No | 01:02:55 | Friday | 1 | 95.216667 | 47 | 40-49 | 0.0 |
| 174895 | 874 | 2019-02-01 01:41:43.414 | 2019-02-01 01:56:17.552 | 253.0 | Haste St at College Ave | 37.866418 | -122.253799 | 253.0 | Haste St at College Ave | 37.866418 | ... | 1995 | Male | Yes | 01:41:43 | Friday | 1 | 14.566667 | 24 | 20-29 | 0.0 |
| 174921 | 943 | 2019-02-01 00:43:11.550 | 2019-02-01 00:58:55.217 | 31.0 | Raymond Kimbell Playground | 37.783813 | -122.434559 | 31.0 | Raymond Kimbell Playground | 37.783813 | ... | 1972 | Male | No | 00:43:11 | Friday | 0 | 15.716667 | 47 | 40-49 | 0.0 |
3458 rows × 23 columns
# re-check the first rows after adding distance_km
ford.head()
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | member_birth_year | member_gender | bike_share_for_all_trip | time_of_day | day_of_week | hour_of_day | duration_mins | age | age_group | distance_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | 1984 | Male | No | 17:32:10 | Thursday | 17 | 869.750000 | 35 | 30-39 | 0.544709 |
| 1 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | ... | 1972 | Male | No | 12:13:13 | Thursday | 12 | 1030.900000 | 47 | 40-49 | 2.704545 |
| 2 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | ... | 1989 | Other | No | 17:54:26 | Thursday | 17 | 608.166667 | 30 | 20-29 | 0.260739 |
| 3 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | ... | 1974 | Male | Yes | 23:54:18 | Thursday | 23 | 26.416667 | 45 | 40-49 | 2.409301 |
| 4 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | ... | 1959 | Male | No | 23:49:58 | Thursday | 23 | 29.883333 | 60 | 50-59 | 3.332203 |
5 rows × 23 columns
# full structure overview: dtypes and non-null counts per column
ford.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 174952 entries, 0 to 174951 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null category 13 member_birth_year 174952 non-null int64 14 member_gender 174952 non-null category 15 bike_share_for_all_trip 174952 non-null category 16 time_of_day 174952 non-null object 17 day_of_week 174952 non-null category 18 hour_of_day 174952 non-null int64 19 duration_mins 174952 non-null float64 20 age 174952 non-null int64 21 age_group 174880 non-null category 22 distance_km 174952 non-null float64 dtypes: category(5), datetime64[ns](2), float64(8), int64(5), object(3) memory usage: 24.9+ MB
# rows whose age fell outside the (0, 100] bins, leaving age_group null
ford[ford.age_group.isnull()]
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | member_birth_year | member_gender | bike_share_for_all_trip | time_of_day | day_of_week | hour_of_day | duration_mins | age | age_group | distance_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1238 | 148 | 2019-02-28 19:29:17.627 | 2019-02-28 19:31:45.967 | 158.0 | Shattuck Ave at Telegraph Ave | 37.833279 | -122.263490 | 173.0 | Shattuck Ave at 55th St | 37.840364 | ... | 1900 | Male | Yes | 19:29:17 | Thursday | 19 | 2.466667 | 119 | NaN | 0.792753 |
| 10407 | 1315 | 2019-02-27 19:21:34.436 | 2019-02-27 19:43:30.008 | 343.0 | Bryant St at 2nd St | 37.783172 | -122.393572 | 375.0 | Grove St at Masonic Ave | 37.774836 | ... | 1900 | Male | No | 19:21:34 | Wednesday | 19 | 21.916667 | 119 | NaN | 4.747020 |
| 15483 | 1131 | 2019-02-27 08:37:36.864 | 2019-02-27 08:56:28.022 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 36.0 | Folsom St at 3rd St | 37.783830 | ... | 1900 | Male | No | 08:37:36 | Wednesday | 8 | 18.850000 | 119 | NaN | 4.307705 |
| 18665 | 641 | 2019-02-26 17:03:19.855 | 2019-02-26 17:14:01.619 | 9.0 | Broadway at Battery St | 37.798572 | -122.400869 | 30.0 | San Francisco Caltrain (Townsend St at 4th St) | 37.776598 | ... | 1900 | Male | No | 17:03:19 | Tuesday | 17 | 10.683333 | 119 | NaN | 2.492247 |
| 20650 | 1424 | 2019-02-26 08:58:02.904 | 2019-02-26 09:21:47.749 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 343.0 | Bryant St at 2nd St | 37.783172 | ... | 1900 | Male | No | 08:58:02 | Tuesday | 8 | 23.733333 | 119 | NaN | 4.747020 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164163 | 1368 | 2019-02-03 17:33:54.607 | 2019-02-03 17:56:42.949 | 37.0 | 2nd St at Folsom St | 37.785000 | -122.395936 | 375.0 | Grove St at Masonic Ave | 37.774836 | ... | 1900 | Male | No | 17:33:54 | Sunday | 17 | 22.800000 | 119 | NaN | 4.589196 |
| 165796 | 993 | 2019-02-03 09:45:30.464 | 2019-02-03 10:02:04.169 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 36.0 | Folsom St at 3rd St | 37.783830 | ... | 1900 | Male | No | 09:45:30 | Sunday | 9 | 16.550000 | 119 | NaN | 4.307705 |
| 169534 | 1527 | 2019-02-01 19:09:28.387 | 2019-02-01 19:34:55.963 | 343.0 | Bryant St at 2nd St | 37.783172 | -122.393572 | 375.0 | Grove St at Masonic Ave | 37.774836 | ... | 1900 | Male | No | 19:09:28 | Friday | 19 | 25.450000 | 119 | NaN | 4.747020 |
| 169703 | 517 | 2019-02-01 18:38:40.471 | 2019-02-01 18:47:18.392 | 25.0 | Howard St at 2nd St | 37.787522 | -122.397405 | 30.0 | San Francisco Caltrain (Townsend St at 4th St) | 37.776598 | ... | 1902 | Female | No | 18:38:40 | Friday | 18 | 8.616667 | 117 | NaN | 1.228913 |
| 174402 | 428 | 2019-02-01 07:45:05.934 | 2019-02-01 07:52:14.922 | 284.0 | Yerba Buena Center for the Arts (Howard St at ... | 37.784872 | -122.400876 | 67.0 | San Francisco Caltrain Station 2 (Townsend St... | 37.776639 | ... | 1901 | Male | No | 07:45:05 | Friday | 7 | 7.133333 | 118 | NaN | 1.029114 |
72 rows × 23 columns
From the above, capping the age bins at 100 exposes outliers (implausible ages above 100, coming from birth years such as 1900); these rows fall outside the bins, receive a null age_group, and will be dropped.
# keep only rows with a valid age_group (ages that fell inside the bins).
# fix: the original negated a boolean Series with the unary minus operator
# (-ford.age_group.isnull()), which modern pandas rejects with a TypeError;
# .notna() expresses the same selection explicitly
ford = ford.loc[ford.age_group.notna(), :]
ford[ford.age_group.isnull()]
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | member_birth_year | member_gender | bike_share_for_all_trip | time_of_day | day_of_week | hour_of_day | duration_mins | age | age_group | distance_km |
|---|
0 rows × 23 columns
# confirm the age-outlier rows were removed (174880 entries remain)
ford.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174880 entries, 0 to 174951 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174880 non-null int64 1 start_time 174880 non-null datetime64[ns] 2 end_time 174880 non-null datetime64[ns] 3 start_station_id 174880 non-null float64 4 start_station_name 174880 non-null object 5 start_station_latitude 174880 non-null float64 6 start_station_longitude 174880 non-null float64 7 end_station_id 174880 non-null float64 8 end_station_name 174880 non-null object 9 end_station_latitude 174880 non-null float64 10 end_station_longitude 174880 non-null float64 11 bike_id 174880 non-null int64 12 user_type 174880 non-null category 13 member_birth_year 174880 non-null int64 14 member_gender 174880 non-null category 15 bike_share_for_all_trip 174880 non-null category 16 time_of_day 174880 non-null object 17 day_of_week 174880 non-null category 18 hour_of_day 174880 non-null int64 19 duration_mins 174880 non-null float64 20 age 174880 non-null int64 21 age_group 174880 non-null category 22 distance_km 174880 non-null float64 dtypes: category(5), datetime64[ns](2), float64(8), int64(5), object(3) memory usage: 26.2+ MB
There were initially 183412 records of individual rides in the dataset with 16 features ('duration_sec', 'start_time', 'end_time', 'start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude', 'end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type', 'member_birth_year', 'member_gender', 'bike_share_for_all_trip'). After feature engineering this grew to 23 features, with seven new columns added ('age', 'age_group', 'time_of_day', 'day_of_week', 'duration_mins', 'hour_of_day', 'distance_km'). After dropping null values and outliers, 174880 records remained.
The time variables were changed into the appropriate datatype (datetime), and the time range runs from 2019-02-01 00:00:20.636000 (start time of the first ride) to 2019-03-01 08:01:55.975000 (end time of the last ride).
The user type, member gender and bike share for all trip are ordered factor variables with the following levels:
user_type: Subscriber, Customer
member_gender: Male, Female, Other
bike_share_for_all_trip: Yes, No
age_group column was created with the levels: '<20', '20-29', '30-39', '40-49', '50-59', '60+'
day_of_week:'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
I am interested in uncovering the best features to understand the bike sharing system:
Time range, Age_groups, duration, user_type, gender, bike_share for all trip, the locations for the trips
The start and end times were broken down into time of day and day of week to understand how trips vary over these periods. All the rides took place in a single year (2019), between February and March.
The user_type, member_gender and bike_share_for_all trips are converted into categorical variable dtypes to understand the distribution of rides among these groups and whether or not shared rides have any difference to individual rides.
Member ages were derived from their birth years, and a new categorical age-group column was engineered from them.
age groups were found to contain values greater than 100 and these were dropped from the data.
# final dtypes after all cleaning and feature engineering
ford.dtypes
duration_sec int64 start_time datetime64[ns] end_time datetime64[ns] start_station_id float64 start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id float64 end_station_name object end_station_latitude float64 end_station_longitude float64 bike_id int64 user_type category member_birth_year int64 member_gender category bike_share_for_all_trip category time_of_day object day_of_week category hour_of_day int64 duration_mins float64 age int64 age_group category distance_km float64 dtype: object
Questions of interest (Univariate plots):
- What is the distribution of ride trips by user_type ?
- What is the distribution of ride trips by member_gender ?
- What is the distribution of ride trips by age_group ?
- What is the distribution of ride trips by duration_mins ?
- What is the distribution of ride trips by distance_km ?
- What is the distribution of ride trips by bike_share_for_all_trip ?
- What is the distribution of ride trips by hour_of_day ?
- What is the distribution of ride trips by day_of_week ?
- What is the distribution of start station latitude and longitude coordinates?
- What is the distribution of end station latitude and longitude coordinates?
# bar chart of rides per user type.
# fix: seaborn >= 0.12 requires data-mapping parameters by keyword, so the
# column is passed as x='user_type' instead of positionally
default_color = sb.color_palette()[0]
plt.figure(figsize=(8,6))
sb.countplot(data=ford, x='user_type', color=default_color).set_title('Trips User Type');
It can be seen that a large proportion of users are subscribers
# bar chart of rides per gender.
# fix: seaborn >= 0.12 requires data-mapping parameters by keyword, so the
# column is passed as x='member_gender' instead of positionally
default_color = sb.color_palette()[0]
plt.figure(figsize=(8,6))
sb.countplot(data=ford, x='member_gender', color=default_color).set_title('Trips by member_gender');
The gender with the most ride trips is the male gender.
In line with this, I will investigate further to understand the average trip duration by gender
# bar chart of rides per age group, drawn in the default palette colour
sb.countplot(data=ford, x='age_group', color=sb.color_palette()[0])
plt.title('Trips by age_group');
The younger age groups have the most ride trips and it declines at the extremes of ages
We will investigate further to understand the average duration by age_group
# Summary statistics for trip duration in minutes
ford['duration_mins'].describe()
count 174880.000000 mean 11.733706 std 27.375248 min 1.016667 25% 5.383333 50% 8.500000 75% 13.150000 max 1409.133333 Name: duration_mins, dtype: float64
# Durations from longest to shortest, to eyeball the extremes
ford['duration_mins'].sort_values(ascending=False)
81604 1409.133333
122163 1391.983333
107291 1390.116667
4987 1386.583333
91253 1375.200000
...
26013 1.016667
171377 1.016667
17901 1.016667
18863 1.016667
76584 1.016667
Name: duration_mins, Length: 174880, dtype: float64
# Standard-scaled histogram of trip duration
bin_width = 5
bin_edges = np.arange(0, ford['duration_mins'].max() + bin_width, bin_width)
plt.figure(figsize=[8, 5])
plt.hist(ford['duration_mins'], bins=bin_edges)
plt.xlabel('Duration (minutes)')
plt.title('Distribution of trips by duration(mins)');
There is a right skew, with the majority of the ride trips lasting less than 200 minutes at the lower end.
We will investigate further by first using an axis limit to zoom into the data.
# Zoom into 0-60 minutes, where the bulk of the trips lie.
# Fix: only generate bins over the visible range — the original built
# ~1400 one-minute bins out to the max duration even though the x-axis
# is clipped at 60, which wastes work without changing the visible plot.
binsize = 1
bins = np.arange(0, 60 + binsize, binsize)
plt.figure(figsize=[8, 5])
plt.hist(data=ford, x='duration_mins', bins=bins)
plt.xlabel('Duration (minutes)')
plt.xlim([0, 60])
plt.title('Distribution of trips by duration(mins)');
By limiting the ride duration to 60mins, we can see majority of the duration in the dataset lies under 1hr
# Describe the log10-transformed durations to size up a log transform
np.log10(ford.duration_mins).describe()
count 174880.000000 mean 0.929676 std 0.305178 min 0.007179 25% 0.731051 50% 0.929419 75% 1.118926 max 3.148952 Name: duration_mins, dtype: float64
# Histogram of trip duration on a log-scaled x-axis.
# Fix: the original labelled the ticks at 1, 10, 100 and 1000 minutes
# as '0', '1h', '2h', '3h', which misreads a log axis (1e2 minutes is
# ~1.7h, not 2h). Label the ticks with their true minute values.
log_binsize = 0.025
bins = 10 ** np.arange(0, np.log10(ford['duration_mins'].max()) + log_binsize, log_binsize)
plt.figure(figsize=[8, 5])
plt.hist(data=ford, x='duration_mins', bins=bins)
plt.xscale('log')
plt.xticks([1e0, 1e1, 1e2, 1e3], ['1', '10', '100', '1000'])
plt.xlabel('Duration (minutes)')
plt.title('Distribution of trips by duration(mins)');
On the log scale the distribution of trip durations appears unimodal and roughly normal (i.e. approximately log-normal), with only a few data points extending into the long right tail.
# Box plot of trip duration to surface outliers.
# Fix: pass the data via `x=` — positional data arguments were
# deprecated in seaborn 0.12.
sb.boxplot(x=ford['duration_mins'])
<AxesSubplot:xlabel='duration_mins'>
From the visualizations above we have safely established that the average duration is 11mins and typically less than 200mins hence we would deal with the outliers.
# Average trip distance (km)
ford.distance_km.mean()
1.6898908017879257
# Average trip duration (minutes)
ford.duration_mins.mean()
11.733705969808028
# Side-by-side view of distance and duration for a quick sanity check
ford.loc[:, ['distance_km', 'duration_mins']]
| distance_km | duration_mins | |
|---|---|---|
| 0 | 0.544709 | 869.750000 |
| 1 | 2.704545 | 1030.900000 |
| 2 | 0.260739 | 608.166667 |
| 3 | 2.409301 | 26.416667 |
| 4 | 3.332203 | 29.883333 |
| ... | ... | ... |
| 174947 | 1.464766 | 8.000000 |
| 174948 | 1.402716 | 5.216667 |
| 174949 | 0.379066 | 2.350000 |
| 174950 | 0.747282 | 2.316667 |
| 174951 | 0.710395 | 4.516667 |
174880 rows × 2 columns
There appears to be a mismatch between duration and distance travelled: one would expect longer trips to take more time, yet in some cases shorter trips take more time. This will be investigated further under the bivariate plots.
# Confirm there are no missing distance values
ford['distance_km'].isna().sum()
0
# Summary statistics for trip distance in km
ford['distance_km'].describe()
count 174880.000000 mean 1.689891 std 1.096671 min 0.000000 25% 0.910443 50% 1.429829 75% 2.223913 max 69.469241 Name: distance_km, dtype: float64
# Standard-scaled histogram of trip distance
bin_width = 1
bin_edges = np.arange(0, ford['distance_km'].max() + bin_width, bin_width)
plt.figure(figsize=[8, 5])
plt.hist(ford['distance_km'], bins=bin_edges)
plt.xlabel('Distance (km)')
plt.title('Distribution of trips by distance(km)');
The distance covered in most trips is apparently less than 10km and also has a right skew
# Zoom into 0-10 km, where the majority of the data points lie
bin_width = 0.2
bin_edges = np.arange(0, ford['distance_km'].max() + bin_width, bin_width)
plt.figure(figsize=[8, 5])
plt.hist(ford['distance_km'], bins=bin_edges)
plt.xlim([0, 10])
# Integer x-ticks from 0 to 10, rendered as plain strings
ticks = list(range(11))
labels = [str(v) for v in ticks]
plt.xticks(ticks, labels);
plt.xlabel('Distance (km)')
plt.title('Distribution of trips by distance(km)');
#appears unimodal with peak around 1km and tails off to the right
# Describe the log10-transformed distances (0 km rides produce -inf)
np.log10(ford.distance_km).describe()
count 1.748800e+05 mean -inf std NaN min -inf 25% -4.074714e-02 50% 1.552842e-01 75% 3.471177e-01 max 1.841793e+00 Name: distance_km, dtype: float64
The log of 0 is negative infinity; as explained earlier, a distance of 0 in the data refers to rides that started and ended at the same station.
# Histogram of trip distance on a log-scaled x-axis.
# Fix: the x-axis was mislabelled 'Duration (minutes)' — this plot
# shows distance in km.
log_binsize = 0.025
bins = 10 ** np.arange(0, np.log10(ford['distance_km'].max()) + log_binsize, log_binsize)
plt.figure(figsize=[8, 5])
plt.hist(data=ford, x='distance_km', bins=bins)
plt.xscale('log')
plt.xlabel('Distance (km)')
plt.title('Distribution of trips by distance(km)');
There is a steep cut-off on the left because all trips with a distance of 0 were excluded by the log transformation.
# Box plot of trip distance to surface outliers.
# Fix: pass the data via `x=` — positional data arguments were
# deprecated in seaborn 0.12.
sb.boxplot(x=ford['distance_km'])
<AxesSubplot:xlabel='distance_km'>
from the box plot we can see one clear outlier in the data beyond 20km and this can be dropped.
# Remove the extreme distance outlier (>= 20 km) and confirm the new shape
ford = ford.query('distance_km < 20')
ford.shape
(174879, 23)
# Re-check the distance box plot after dropping the outlier.
# Fix: pass the data via `x=` — positional data arguments were
# deprecated in seaborn 0.12.
sb.boxplot(x=ford['distance_km'])
<AxesSubplot:xlabel='distance_km'>
Now the plot looks better
# Distance statistics after removing the outlier
ford.distance_km.describe()
count 174879.000000 mean 1.689503 std 1.084631 min 0.000000 25% 0.910423 50% 1.429829 75% 2.223913 max 15.673955 Name: distance_km, dtype: float64
# Countplot of trips by bike-share-for-all-trip status
base_color = sb.color_palette()[0]
sb.countplot(data=ford, x='bike_share_for_all_trip', color=base_color)
plt.title('Trips by bike_share_for_all_trip');
The distribution is highly uneven: a large majority of the trips are not part of the bike-share-for-all program (far more 'No' than 'Yes').
# Countplot of trips by hour of day
base_color = sb.color_palette()[0]
plt.figure(figsize=[10, 6])
sb.countplot(data=ford, x='hour_of_day', color=base_color)
plt.title('Trips by hour_of_day');
There appear to be two peaks, at around 8am and 5pm, which correspond fairly well to the start and end of a typical work day.
# Trip counts per day of the week
ford['day_of_week'].value_counts()
Thursday 33705 Tuesday 30568 Wednesday 28415 Friday 27646 Monday 25630 Sunday 14504 Saturday 14411 Name: day_of_week, dtype: int64
# Countplot of trips by day of week, in calendar order
weekday_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
base_color = sb.color_palette()[0]
plt.figure(figsize=[10, 6])
sb.countplot(data=ford, x='day_of_week', color=base_color, order=weekday_order)
plt.title('Trips by day_of_week');
It can be seen that most trips occur during the weekdays and less on weekends.
# Side-by-side histograms of start-station latitude and longitude
fig, (ax_lat, ax_lon) = plt.subplots(1, 2, figsize=[20, 5])
ax_lat.hist(ford['start_station_latitude'], bins=20)
ax_lat.set_xlabel('start_station_latitude')
ax_lat.set_ylabel('Frequency')
ax_lat.set_title('Distribution of Start station Latitude')
ax_lon.hist(ford['start_station_longitude'], bins=20)
ax_lon.set_xlabel('start_station_longitude')
ax_lon.set_ylabel('Frequency')
ax_lon.set_title('Distribution of Start station longitude');
# Side-by-side histograms of end-station latitude and longitude
fig, (ax_lat, ax_lon) = plt.subplots(1, 2, figsize=[20, 5])
ax_lat.hist(ford['end_station_latitude'], bins=20)
ax_lat.set_xlabel('end_station_latitude')
ax_lat.set_ylabel('Frequency')
ax_lat.set_title('Distribution of End station Latitude')
ax_lon.hist(ford['end_station_longitude'], bins=20)
ax_lon.set_xlabel('end_station_longitude')
ax_lon.set_ylabel('Frequency')
ax_lon.set_title('Distribution of End station longitude');
Majority of the locations fall within a narrow spectrum of start & end latitudes and longitudes with two distinct regions for the latitude (between 37.3 and 37.4) and longitude (between -122.4 and -122.2 and -121.9)
Distribution of ride trips by user_type - It can be seen that a large proportion of users are subscribers
Distribution of ride trips by member_gender - The gender with the most ride trips is the male gender.
Distribution of ride trips by age_group - The working class age groups have the most ride trips and it declines at the extremes of ages
Distribution of ride trips by duration_mins - By limiting the ride duration to 60mins, we can see majority of the duration in the dataset lies under 1hr
Distribution of ride trips by distance_km - The distance covered in most trips is apparently less than 10km and also has a right skew
Distribution of ride trips by bike_share_for_all_trip - A large proportion of the trips are not equally distributed with more Nos
Distribution of ride trips by hour_of_day - Appears to have two peaks, by 8am and 5pm which fairly corresponds to the beginning of the a typical work day and close of the work day
Distribution of ride trips by day_of_week - It can be seen that most trips occur during the weekdays and less on weekends.
Distribution of start & end station latitude and longitude coordinates - Majority of the locations fall within a narrow spectrum of start & end latitudes and longitudes with two distinct regions for the latitude (between 37.3 and 37.4) and longitude (between -122.4 and -122.2 and -121.9)
Duration(mins) - had an unusual distribution with a right skew, and I had to use an xlim on the x-axis to zoom into the region with the most data points; most ride durations last less than 1hr, and upon log transformation we see a unimodal, roughly normal distribution!
Distance(km) - had an unusual distribution with a right skew and I had to use an xlimit on the x-axis to zoom into the region of the data with the most datapoints and appeared fairly unimodal with a right skew, and seems to follow the same distribution with duration except that log transformations for 0 distances were cut off. and I wonder the relationship between duration and distance.
To begin with, I want to look at the pairwise correlations present between features in the data.
# Select the numeric and categorical variables to analyse
num_vars = ford.select_dtypes(include=['int', 'float'])
cat_vars = list(ford.select_dtypes(include=['category']))
# Dropped columns: pure identifiers (station/bike ids), parents or
# derivatives of kept columns (duration_sec, member_birth_year), and
# the end coordinates, which are highly correlated with the start ones.
drop_cols = ['start_station_id', 'end_station_id', 'duration_sec',
             'member_birth_year', 'bike_id',
             'end_station_latitude', 'end_station_longitude']
num_vars = num_vars.drop(columns=drop_cols)
# Correlation heatmap of the remaining numeric variables
plt.figure(figsize=[15, 8])
sb.heatmap(num_vars.corr(), annot=True, fmt='.3f', cmap='vlag_r', center=0)
plt.show()
As expected, there is a fairly strong correlation between start station latitudes and longtitudes but decreases with other variables. And general weak correlations across board. However, it is surprising to see that even distance and time also have a weak positive correlation.
# Plot matrix: sample 500 trips so the plots are clearer and render faster
print("ford.shape =", ford.shape)
ford_samp = ford.sample(n=500, replace=False)
print("ford_samp.shape =", ford_samp.shape)
# Fix: `num_vars` is a DataFrame; PairGrid's `vars` expects the variable
# NAMES, so pass the column index explicitly instead of relying on
# DataFrame iteration happening to yield column labels.
g = sb.PairGrid(data=ford_samp, vars=num_vars.columns)
g = g.map_diag(plt.hist, bins=20);
g.map_offdiag(plt.scatter);
ford.shape = (174879, 23) ford_samp.shape = (500, 23)
It can be seen that there are similar distributions for start longitude and latitude for the other variables. There appears to be a huge proportion of distance covered within the age brackets of 20-40, and at distances below 10km. Age and duration appears to be clustered close to the y-axis, and age and hour of day appear to be clustered around the centre. Duration and distance show similar pattern to age and distance with visible outliers. Hour of day shows similar patterns with age, and distance but clustered to the y-axis for duration as majority of the ride trips did were less than 1hr and visible outliers can be seen.
# Scatter of duration against distance to inspect their relationship
plt.scatter(ford['duration_mins'], ford['distance_km'])
plt.xlabel('duration_mins')
plt.ylabel('distance_km')
plt.title('Distance_km vs Duration_mins');
# Pearson correlation between distance and duration (symmetric)
ford['duration_mins'].corr(ford['distance_km'])
0.14139010826419218
Looking at the parameters above, the relationship between distance and duration is much weaker than expected (correlation of only about 0.14): one would expect longer distances to take proportionally more time, but many short-distance trips take a very long time. As established earlier, trips starting and ending at the same station have a distance of 0, so distance will not be a good predictor in those cases.
Now I want to investigate how duration and distance vary with the categorical varibles
# Plot matrix of the numeric features against the categorical features.
# A larger sample is fine here since these plots are simpler in nature.
ford_samp = ford.sample(n=5000, replace=False)

def boxgrid(x, y, **kwargs):
    """Box-plot adapter for seaborn's PairGrid.

    Fix: pass x/y by keyword — positional data arguments to sb.boxplot
    were deprecated in seaborn 0.12.
    """
    default_color = sb.color_palette()[0]
    sb.boxplot(x=x, y=y, color=default_color)

plt.figure(figsize=[10, 10])
# Fix: PairGrid's `size` parameter was renamed `height` in seaborn 0.9
g = sb.PairGrid(data=ford_samp, y_vars=['duration_mins', 'distance_km'],
                x_vars=cat_vars, height=6, aspect=1)
g.map(boxgrid)
plt.show();
<Figure size 720x720 with 0 Axes>
From the visualization, there are quite a number of outliers across all categorical variables, which makes it difficult to tell the differences between groups. The subscribers had a higher count in the univariate plot; however, customers cover more distance. Although the male gender was the most prominent by count, the distances covered by the genders did not vary much. The same goes for bike share for all trips, where the higher count was recorded for 'No', and those not in the sharing scheme also cover more distance. The distribution of distance covered among age groups is not surprising, as the majority of rides fall within the working-class groups; the average ride distance for the 60+ group appears nearly as high as for the 30-39 group despite a far lower count in the univariate plots, which could mean older riders in this category take longer bike trips, or that there are large outliers.
We will take a closer look at the duration plots with the categorical variables
# Box plots of duration against each categorical variable, with the
# y-axis limited to 0-60 minutes so the boxes stay readable.
# Fixes: subplot 5 was mis-titled 'duration vs age_group' though it
# plots day_of_week; subplot 1 had no title; the layout comments said
# "2 row, 2 cols" for what is actually a 3x2 grid.
plt.figure(figsize=[20, 20])
default_color = sb.color_palette()[0]

# 3 rows, 2 cols, subplot 1
plt.subplot(3, 2, 1)
plt.ylim([0, 60])
sb.boxplot(x='bike_share_for_all_trip', y='duration_mins', data=ford, color=default_color)
plt.title('duration vs bike_share_for_all_trip');

# 3 rows, 2 cols, subplot 2
plt.subplot(3, 2, 2)
plt.ylim([0, 60])
sb.boxplot(x='user_type', y='duration_mins', data=ford, color=default_color)
plt.title('duration vs user_type');

# 3 rows, 2 cols, subplot 3
plt.subplot(3, 2, 3)
plt.ylim([0, 60])
sb.boxplot(x='member_gender', y='duration_mins', data=ford, color=default_color)
plt.title('duration vs member_gender');

# 3 rows, 2 cols, subplot 4
plt.subplot(3, 2, 4)
plt.ylim([0, 60])
sb.boxplot(x='age_group', y='duration_mins', data=ford, color=default_color)
plt.title('duration vs age_group');

# 3 rows, 2 cols, subplot 5
plt.subplot(3, 2, 5)
plt.ylim([0, 60])
sb.boxplot(x='day_of_week', y='duration_mins', data=ford, color=default_color,
           order=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'])
plt.title('duration vs day_of_week');
From the visualization above, duration did not seem to vary much among the different categories. However, there is an inverse relationship for those who share bikes for their trips: despite forming the higher proportion by count, they actually record shorter trips. A similar pattern is seen for user_type and member_gender, where customers, with a lower count, have longer trips than subscribers. This could be the effect of outliers.
We can investigate age further beyond using the age group, we can use their actual ages
# Scatter of actual rider age against trip duration
plt.figure(figsize=[10, 6])
point_color = sb.color_palette()[0]
plt.scatter(ford['age'], ford['duration_mins'], color=point_color)
plt.xlabel('age(years)')
plt.ylabel('duration(mins)')
plt.title('Duration(mins) travelled by age');
As can be seen, the majority of the data lies below the age of 80, since outliers were dropped earlier, but there are still a few outliers between 80 and 100 that could be the result of inaccurate data entry; these could be removed, as anyone beyond 80 years of age is unlikely to be physically fit enough to ride a bike. Besides, the majority of trip durations do not exceed 200 minutes, so longer ones can be treated as outliers. It is also seen that younger people ride for longer, which is expected.
# Regression plot of rider age against trip distance.
# Fixes: pass `x` by keyword (positional data arguments to sb.regplot
# were deprecated in seaborn 0.12) and remove the stray ')' that was in
# the y-axis label ('distance_km)').
plt.figure(figsize=[10, 6])
default_color = sb.color_palette()[0]
sb.regplot(x='age', y='distance_km', data=ford, color=default_color)
plt.xlabel('age(years)')
plt.ylabel('distance(km)')
plt.title('Distance(km) travelled by age');
from the visualization above, it can be seen that younger people travel more distances than older people which follows a similar pattern for duration seen above
The striking observation thus far is in the relationship between distance and duration. One would expect a linear relationship and a strong positive correlation as the time taken is expected to increase as distance increases but that is not the case here as we see an inverse relationship.
For age, there were few outliers seen, as anyone above 80 is not really likely to still be riding, however it's an insignificant amount in the data so it may be expected. It is also seen that people of younger ages ride for longer hours compared to the older age group It is also seen that younger people travel more distances than older people and it is also expected For distribution across the categorical variables, there is a reverse relationship between the count and the level of utility, where despite more male riders, females appear to have longer distances travelled.Same is seen for user type, where ther are more subscribers, but customers seem to have longer time for travel and distance.
# Heatmap of average trip duration by day of week and hour of day.
# Fixes: use the string 'mean' (passing np.mean to aggfunc emits a
# FutureWarning in pandas 2.x), and actually apply `order_of_days` —
# it was defined but never used, so rows showed in arbitrary order.
duration_by_day_hour = ford.pivot_table(values='duration_mins', index='day_of_week',
                                        columns='hour_of_day', aggfunc='mean')
# Define the order of days of the week and reindex the rows to match
order_of_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
duration_by_day_hour = duration_by_day_hour.reindex(order_of_days)
plt.figure(figsize=[12, 8])
sb.heatmap(duration_by_day_hour, cmap='viridis_r', fmt='.0f',
           cbar_kws={'label': 'Average Trip Duration (minutes)'}, annot=True)
plt.title('Average trip duration (mins) by day of week and hour of day')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.show()
Note that while the univariate plot showed Thursday as the day with the most rides by count, this heatmap shows the average trip duration by day of week and hour of day rather than ride counts.
# Heatmap of average trip duration by user type, gender, and age group.
# The string 'mean' avoids the pandas FutureWarning for np.mean.
duration_by_type_gender_age = ford.pivot_table(values='duration_mins', index='user_type',
                                               columns=['member_gender', 'age_group'],
                                               aggfunc='mean')
plt.figure(figsize=[12, 8])
sb.heatmap(duration_by_type_gender_age, cmap='coolwarm', annot=True,
           cbar_kws={'label': 'Average Trip Duration (minutes)'})
plt.title('Average trip duration (mins) by user type, gender, and age group')
plt.xlabel('Gender and Age Group')
plt.ylabel('User Type')
plt.show()
We saw in the bivariate plots that the female and other genders tend to have higher average trip durations than the male gender, and that customers have a higher average trip duration than subscribers; this is seen clearly here.
# Scatter of start-station coordinates, colored by trip duration
plt.figure(figsize=[10, 8])
plt.scatter(ford['start_station_longitude'], ford['start_station_latitude'],
            c=ford['duration_mins'], cmap='viridis_r')
plt.colorbar(label='Trip Duration (minutes)')
plt.xlabel('Start Station Longitude')
plt.ylabel('Start Station Latitude')
plt.title('Trip Duration by Start Station Location')
plt.show()
We have seen in the univariate plots that our duration generally falls below 200mins and it's visible here by the heatmap and are generally across three major latitude/longitude locations
# Stacked bar chart of trip counts per user type by day of week.
# Fix: the original called plt.figure() and then DataFrame.plot, which
# creates its OWN figure — leaving an empty '<Figure ... with 0 Axes>'
# (visible in the cell output). Pass figsize to .plot instead.
trips_by_day_user = ford.pivot_table(values='duration_sec', index='day_of_week',
                                     columns='user_type', aggfunc='count')
trips_by_day_user.plot(kind='bar', stacked=True, figsize=[10, 8])
plt.xlabel('Day of Week')
plt.ylabel('Number of Trips')
plt.title('Number of Trips per User Type by Day of Week')
plt.show()
<Figure size 720x576 with 0 Axes>
The stacked bar chart here illustrates the number of trips per user type and still highlights the weekdays as having more trips (Thursday being the highest), with subscribers taking more trips than customers.
# Recap the available columns before building the map visualization
ford.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'start_station_latitude',
'start_station_longitude', 'end_station_id', 'end_station_name',
'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
'member_birth_year', 'member_gender', 'bike_share_for_all_trip',
'time_of_day', 'day_of_week', 'hour_of_day', 'duration_mins', 'age',
'age_group', 'distance_km'],
dtype='object')
import plotly.express as px

# Interactive map of trip start locations, colored by user type.
# Column names are resolved against the DataFrame passed first.
fig = px.scatter_mapbox(
    ford,
    lat="start_station_latitude",
    lon="start_station_longitude",
    width=800,   # map width in pixels
    height=600,  # map height in pixels
    color="user_type",
    hover_data=["duration_mins"],  # show the trip duration on hover
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()